Clustering of GBS markers to linkage groups


preliminary analysis:

Inspect missings in markers and individuals

missing data per individual

#missing data per marker
mrk <- read.csv2("LM3_F7_K.MARKER.genoSUMMARY.csv", header = T, sep = ",")

head(mrk)
##                   marker  AX EX NA. tot
## 1 Peex113Ctg00004_119009 107 54  34 195
## 2 Peex113Ctg00043_151704 105 58  32 195
## 3  Peex113Ctg00050_84548 101 51  43 195
## 4  Peex113Ctg00050_84557 103 52  40 195
## 5  Peex113Ctg00050_84666 106 72  17 195
## 6 Peex113Ctg00050_485230 106 72  17 195
mrks <- melt(mrk)
mrks <- mrks %>% filter(variable != "tot")

#check for exessive missings
mrk[which(mrk$NA. > 100),]
## [1] marker AX     EX     NA.    tot   
## <0 rows> (or 0-length row.names)
ggplot(data = mrks, aes(value, fill = variable)) +
  geom_density(alpha = 0.2)

missing data per individual

#missing data per individual
indv <- read.csv2("LM3_F7_K.INDIVIDUAL.genoSUMMARY.csv", header = T, sep = ",")
indvs <- melt(indv)
head(indv)
##   individual   AX   EX NA.  tot
## 1    RIL_196  604 1226  22 1852
## 2    RIL_197 1103  537 212 1852
## 3    RIL_198  456  994 402 1852
## 4    RIL_199 1343  475  34 1852
## 5    RIL_200  934  557 361 1852
## 6    RIL_201  690  730 432 1852
head(indvs)
##   individual variable value
## 1    RIL_196       AX   604
## 2    RIL_197       AX  1103
## 3    RIL_198       AX   456
## 4    RIL_199       AX  1343
## 5    RIL_200       AX   934
## 6    RIL_201       AX   690
indvs <- indvs %>% filter(variable != "tot")

#distribution of missings and ax and ex genotypes
ggplot(data = indvs, aes(value, variable, color = variable)) +
  geom_point(alpha = 0.2)

#ax vs ex
ggplot(data = indv, aes(AX, EX)) +
  geom_point(alpha = 0.2)

#missings
ggplot(data = indv, aes(NA.)) +
  geom_histogram(alpha = 0.2)

#remove those 4 individuals from the genetic map building

indv[which(indv$NA. > 1000),]
##    individual  AX  EX  NA.  tot
## 29     RIL_35  12  22 1818 1852
## 38     RIL_44   7  15 1830 1852
## 63     RIL_67  10  19 1823 1852
## 88     RIL_88 200 553 1099 1852
# 4 individuals removed and altered as LM3_F7_K.markers.clean.csv

Map building

f7_K <- read.cross(format = "csv", file = "LM3_F7_K.markers.clean.csv", F.gen = 7, genotypes = c("AX", "HET", "EX"))
##  --Read the following data:
##   191  individuals
##   1852  markers
##   1  phenotypes
##  --Cross type: bcsft
f7_K <- convert2riself(f7_K)

#save duplicates and corresponding markers in bins to reconstruct them later

duplicatesf7_K <- findDupMarkers(f7_K)

f7_dups <- do.call(rbind, lapply(seq_along(duplicatesf7_K), function(i){
  data.frame(bin_name=names(duplicatesf7_K)[i], duplicatesf7_K[[i]])}))
head(f7_dups)
##                 bin_name    duplicatesf7_K..i..
## 1  Peex113Ctg00050_84666 Peex113Ctg00050_485230
## 2 Peex113Ctg00050_485239 Peex113Ctg00050_485279
## 3  Peex113Ctg05141_86416  Peex113Ctg05141_86419
## 4 Peex113Ctg18048_296982 Peex113Ctg05145_130001
## 5 Peex113Ctg18048_296982 Peex113Ctg00860_156636
## 6 Peex113Ctg18048_296982 Peex113Ctg09060_154418
write.table(f7_dups,file = "LM3_F7_K.marker.dupbins1.txt", quote = FALSE, row.names = FALSE)

#remove duplicate markers
f7_K <- pullCross(f7_K, type = "co.located")

Generate inital map

#set bychr = FALSE to allow complete reconstruction of map 
map1 <- mstmap(f7_K, pop.tpye = "ARIL", bychr = F, dist.fun = "kosambi", trace = TRUE, detectBadData = T, p.value = 1e-09, , mvest.bc = TRUE, return.imputed = T)

#sort markers in each LG
map1 <- mstmap(map1, pop.tpye = "ARIL", bychr = T, dist.fun = "kosambi", trace = TRUE, detectBadData = T, p.value = 1e-09, , mvest.bc = TRUE, return.imputed = T)

#inspect results
summary(map1)
##     RI strains via selfing
## 
##     No. individuals:    191 
## 
##     No. phenotypes:     1 
##     Percent phenotyped: 100 
## 
##     No. chromosomes:    12 
##         Autosomes:      L.1 L.10 L.11 L.12 L.2 L.3 L.4 L.5 L.6 L.7 L.8 L.9 
## 
##     Total markers:      744 
##     No. markers:        182 3 1 1 82 93 79 144 75 70 4 10 
##     Percent genotyped:  89.8 
##     Genotypes (%):      AA:46.4  BB:53.6
geno.image(map1)

head(geno.table(map1))
##                        chr missing AA BB     P.value
## Peex113Ctg17884_167193 L.1      45 90 56 0.004895054
## Peex113Ctg18307_162364 L.1      30 75 86 0.385985052
## Peex113Ctg18308_61078  L.1      23 82 86 0.757620724
## Peex113Ctg18308_60876  L.1      24 81 86 0.698821641
## Peex113Ctg18307_162308 L.1      50 55 86 0.009036479
## Peex113Ctg17849_345854 L.1      29 77 85 0.529650670
#geno.image(map1, col = c("white", "darkorange", "lightblue"))
heatMap(map1, lmax = 25)

nmar(map1)
##  L.1 L.10 L.11 L.12  L.2  L.3  L.4  L.5  L.6  L.7  L.8  L.9 
##  182    3    1    1   82   93   79  144   75   70    4   10
summary(map1)
##     RI strains via selfing
## 
##     No. individuals:    191 
## 
##     No. phenotypes:     1 
##     Percent phenotyped: 100 
## 
##     No. chromosomes:    12 
##         Autosomes:      L.1 L.10 L.11 L.12 L.2 L.3 L.4 L.5 L.6 L.7 L.8 L.9 
## 
##     Total markers:      744 
##     No. markers:        182 3 1 1 82 93 79 144 75 70 4 10 
##     Percent genotyped:  89.8 
##     Genotypes (%):      AA:46.4  BB:53.6

Merging of unlinked markers

Expected 7 LGs in Petunia, therefore merge small LGs into a single one and try to push markers onto the 7 large LGs

#put all small LGs in to one
nmar(map1)
##  L.1 L.10 L.11 L.12  L.2  L.3  L.4  L.5  L.6  L.7  L.8  L.9 
##  182    3    1    1   82   93   79  144   75   70    4   10
row.names(as.data.frame(unlist(nmar(map1)[nmar(map1) < 20])))
## [1] "L.10" "L.11" "L.12" "L.8"  "L.9"
map2 <- mergeCross(map1, merge = list(
                       "UNKNOWN" =row.names(as.data.frame(unlist(nmar(map1)[nmar(map1) < 20])))
))
heatMap(map2)
## Warning in heatMap(map2): Running est.rf.

#try to assign them to chromosomes
#map2 <- pushCross(map2, type = "unlinked", unlinked.chr = "UNKNOWN" )

#map3 <- mstmap(map2, bychr = TRUE, trace = TRUE, anchor = TRUE, p.value= 2)
#summary(map3)
nmar(map1)
##  L.1 L.10 L.11 L.12  L.2  L.3  L.4  L.5  L.6  L.7  L.8  L.9 
##  182    3    1    1   82   93   79  144   75   70    4   10
# summary(map2)
#no improvement

names(map2$geno)[8] <- "L.8"

profileGen(map2, bychr = FALSE, stat.type = c("xo", "dxo", "miss"), id = "Genotype", xo.lambda = 25, layout = c(1, 3), lty = 2)

write.cross(format = "csv", map2, "F7_Kmap2.asmap")
?profileGen

Conversion to ABHgenotyper

python ~/Tools/repositories/GBS2map/Rqtl_2_abh.py -i F7_Kmap2.asmap.csv -o F7_Kmap2.abhIN.csv -q2a
## Extracting 744 markers from 191 individuals
## Finished.
## QTL to ABHGENOTYPER

ABH genotyper

#test 
library(ABHgenotypeR)
abh <- readABHgenotypes("F7_Kmap2.abhIN.csv", nameA = "A", nameB = "B")

plotGenos(abh)
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead

# NA's are mistakenly labelled as heterozygotes when heterozygotes are absent. Bug in abhgenotyper code

pes <- imputeByFlanks(abh)
## The absolute number of genotypes in inputGenos is:
##     A     B     N 
## 59261 68404 14439 
## 
## or in percentage
## 
##        A      B      H  N
## A 41.703 48.137 10.161 NA
## 
## The absolute number of genotypes in outputGenos is:
##     A     B     N 
## 64923 74568  2613 
## 
## or in percentage
## 
##        A      B     H  N
## A 45.687 52.474 1.839 NA
pes <- correctStretches(pes)
## The absolute number of genotypes in inputGenos is:
##     A     B     N 
## 64923 74568  2613 
## 
## or in percentage
## 
##        A      B     H  N
## A 45.687 52.474 1.839 NA
## 
## The absolute number of genotypes in outputGenos is:
##     A     B     N 
## 64995 74496  2613 
## 
## or in percentage
## 
##        A      B     H  N
## A 45.738 52.424 1.839 NA
pes <- imputeByFlanks(pes)
## The absolute number of genotypes in inputGenos is:
##     A     B     N 
## 64995 74496  2613 
## 
## or in percentage
## 
##        A      B     H  N
## A 45.738 52.424 1.839 NA
## 
## The absolute number of genotypes in outputGenos is:
##     A     B     N 
## 64995 74496  2613 
## 
## or in percentage
## 
##        A      B     H  N
## A 45.738 52.424 1.839 NA
reportGenos(pes)
## The absolute number of genotypes in pes is:
##     A     B     N 
## 64995 74496  2613 
## 
## or in percentage
## 
##        A      B     H  N
## A 45.738 52.424 1.839 NA
plotGenos(pes, chromToPlot = 1:7)
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead

# NOTE: 

# NA's are mistakenly labelled as heterozygotes when heterozygotes are absent. Bug in abhgenotyper code

writeABHgenotypes(pes, "F7_Kmap3.abhOUT.csv")

convert back to Rqtl format

python ~/Tools/repositories/GBS2map/Rqtl_2_abh.py -i F7_Kmap3.abhOUT.csv -o F7_Kmap3.asmapIN.csv -a2q
## Finished.
## ABHGENOTYPER to QTL

rebuild map from imputed and corrected markers in ASMap

map3 <- read.cross(format = "csv", file ="F7_Kmap3.asmapIN.csv", F.gen = 7, genotypes = c("A", "HET", "B"))
## Warning in read.cross.csv(dir, file, na.strings, genotypes, estimate.map, : The following unexpected genotype codes were treated as missing.
##     |N|
##  --Read the following data:
##   191  individuals
##   744  markers
##   1  phenotypes
## Warning in summary.cross(cross): Strange genotype pattern.
##  --Cross type: bcsft
map3 <- convert2riself(map3)
map3 <- pullCross(map3, type = "co.located")

#set bychr = FALSE to allow complete reconstruction of map 
map4 <- mstmap(map3, pop.tpye = "ARIL", bychr = F, dist.fun = "kosambi", trace = TRUE, detectBadData = T, p.value = 1e-09, , mvest.bc = TRUE, return.imputed = T)

Final map

summary(map4)
## Warning in summary.cross(map4): Some markers at the same position on chr L.
## 1,L.11,L.2,L.3,L.4,L.5,L.6,L.7,L.8; use jittermap().
##     RI strains via selfing
## 
##     No. individuals:    191 
## 
##     No. phenotypes:     1 
##     Percent phenotyped: 100 
## 
##     No. chromosomes:    11 
##         Autosomes:      L.1 L.10 L.11 L.2 L.3 L.4 L.5 L.6 L.7 L.8 L.9 
## 
##     Total markers:      417 
##     No. markers:        113 3 7 33 51 47 82 37 39 3 2 
##     Percent genotyped:  98.3 
##     Genotypes (%):      AA:47.3  BB:52.7
heatMap(map4)
## Warning in heatMap(map4): Running est.rf.

plot.map(map4)

profileGen(map4, bychr = FALSE, stat.type = c("xo", "dxo", "miss"), id = "Genotype", xo.lambda = 25, layout = c(1, 3), lty = 2)

profileMark(map4, stat.type = c("seg.dist", "dxo", "erf", "lod"), id = "Genotype", layout = c(1, 4), type = "l")
## Warning in summary.cross(cross): Some markers at the same position on chr
## L.1,L.11,L.2,L.3,L.4,L.5,L.6,L.7,L.8; use jittermap().

#crossover events dropped from 11 to 9
mean(countxo(map2))
## [1] 13.03141
mean(countxo(map4))
## [1] 9.685864
summaryMap(map4)
##         n.mar length ave.spacing max.spacing
## L.1       113  117.3         1.0         7.4
## L.10        3    0.3         0.1         0.1
## L.11        7    4.2         0.7         3.4
## L.2        33   28.1         0.9         6.6
## L.3        51   86.9         1.7        16.6
## L.4        47   58.5         1.3        11.8
## L.5        82   95.3         1.2        13.9
## L.6        37   53.4         1.5        10.7
## L.7        39   64.5         1.7        20.9
## L.8         3    0.0         0.0         0.0
## L.9         2    5.1         5.1         5.1
## overall   417  513.5         1.3        20.9